Load required R packages and scripts.

# source("https://bioconductor.org/biocLite.R")
# biocLite("TCGAbiolinks")
library(TCGAbiolinks)
library(biomaRt)
library(SummarizedExperiment)
library(edgeR)
library(singscore)
library(plyr)
library(DT)

library(survival)
library(ggfortify)
library(RColorBrewer)

source("./script/Survival_analysis.R")

First, we read the expression data into R. We have already downloaded the FPKM RNASeq format of the metastatic SKCM samples from TCGA, and converted it to an R data object and saved it. We load the FPKM level data which is filtered for protein-coding genes. You can read any expression data at this step.

load("./output/TCGA_RNAseq_FPKM_proteinCoding.RData")

Score samples

Now we would like to score samples against several signatures. Here we read in the signatures; these are csv or txt files that have at least a column of gene symbol and and a column of gene direction (if applicable). You can read in your own signatures here if you wish. Then we store gene names using different variable names.

##------- Read in the NK signature 
nk_signature <- read.csv("./data/NK_genes_HGNC.csv", stringsAsFactors = F)
##--- store gene symbol in variable "nk" in R
nk <- as.character(nk_signature$Symbol)

##------- Read in a data frame that has information relate to both Epi and Mes signatures
emt_signature <- read.table("./data/Thiery_EMTsignature_both_tumour_cellLine_EntrezIDs.txt",
                  header = T, sep = "\t")
##--- Extract Epi signature
epi <- emt_signature$HGNC.symbol[emt_signature$epiMes_tumor == "epi"]
epi <- as.character(epi[complete.cases(epi)])
##--- Extract Mes signature
mes <- emt_signature$HGNC.symbol[emt_signature$epiMes_tumor == "mes"]
mes <- as.character(mes[complete.cases(mes)])

##-------- Read in the TGFb-EMT signature
tgfb_signature <- read.table("./data/Foroutan2016_TGFb_EMT_signature_upDown.txt", 
                   header = T, sep = "\t")
##---- Store up- and down- gene sets separately
tgfb_up <- as.character(tgfb_signature$Symbol[tgfb_signature$upDown == "up"])
tgfb_dn <- as.character(tgfb_signature$Symbol[tgfb_signature$upDown == "down"])

The scoring method that we use is a rank-based method; therefore, we rank genes in each sample based on expression aboundance. We use the ranked data in the next step to score samples using singscore method.

exprData <- assay(rnaseq_fpkm)
tcgaRank <- rankGenes(exprData)

Score samples against four signatures: NK, Epi, MEs and TGFb-EMT:

nkScore_tcga <- simpleScore(
  rankData = tcgaRank,
  upSet = nk,
  centerScore = T,
  knownDirection = T
  )
## Warning in singscoring(rankData, upSet = upSet, downSet = downSet,
## subSamples = subSamples, : 1 genes missing: KIR2DS4
epiScore_tcga <- simpleScore(
  rankData = tcgaRank,
  upSet = epi,
  centerScore = T,
  knownDirection = T
  )
## Warning in singscoring(rankData, upSet = upSet, downSet = downSet,
## subSamples = subSamples, : 2 genes missing: C1orf106, OR7E14P
mesScore_tcga <- simpleScore(
  rankData = tcgaRank,
  upSet = mes,
  centerScore = T,
  knownDirection = T
  )
## Warning in singscoring(rankData, upSet = upSet, downSet = downSet,
## subSamples = subSamples, : 4 genes missing: GUCY1B3, KIAA1462, LHFP, PTRF
tgfbScore_tcga <- simpleScore(
  rankData = tcgaRank,
  upSet = tgfb_up,
  downSet = tgfb_dn,
  centerScore = T,
  knownDirection = T
  )
## Warning in singscoring(rankData, upSet = upSet, downSet = downSet,
## subSamples = subSamples, : 12 genes missing: PFTK1, BHLHB2, PSCD1, C5orf13,
## KIAA0692, LOH3CR2A, C4orf18, CCDC99, FLJ10357, CXCR7, FLJ14213, C6orf145
## Warning in singscoring(rankData, downSet, NULL, subSamples, FALSE,
## dispersionFun): 9 genes missing: TACSTD1, SDPR, HRASLS3, KIAA0182,
## FLJ20273, RBM35A, GOLSYN, SQRDL, DEPDC6
epiScore_tcga$sampleID <- 
  mesScore_tcga$sampleID <- 
  tgfbScore_tcga$sampleID <- 
  nkScore_tcga$sampleID <- 
  rnaseq_fpkm$sample

Explore the scores

Landscape plots

Plot landscape of NK scores versus epithelial scores:

plotScoreLandscape(scoredf1 = epiScore_tcga, 
                   scoredf2 = nkScore_tcga, 
                   scorenames = c("Epithelial scores", "NK scores"), 
                   textSize = 1, 
                   isInteractive = T,
                   hexMin = 100)

Plot landscape of NK scores versus Mesenchymal scores:

plotScoreLandscape(scoredf1 = mesScore_tcga, 
                   scoredf2 = nkScore_tcga, 
                   scorenames = c("Mesenchymal scores", "NK scores"), 
                   textSize = 1, 
                   isInteractive = T,
                   hexMin = 100)

Plot landscape of NK scores versus TGFb-EMT scores:

plotScoreLandscape(scoredf1 = tgfbScore_tcga, 
                   scoredf2 = nkScore_tcga, 
                   scorenames = c("TGFb-EMT scores", "NK scores"), 
                   textSize = 1, 
                   isInteractive = T,
                   hexMin = 100)

There is a positive correlation between TGFb-EMT and mesenchymal scores:

plotScoreLandscape(scoredf1 = tgfbScore_tcga, 
                   scoredf2 = mesScore_tcga, 
                   scorenames = c("TGFb-EMT scores", "Mesenchymal scores"), 
                   textSize = 1, 
                   isInteractive = T,
                   hexMin = 100)

We would like to have only one data set for all the scores, so we need t merge them. Here we make the score data sets ready to merge. Basically, we only extract the sampleID and scores and re-name the score column to the signature name that we used to score samples.

nkScore_tcga <- nkScore_tcga[, c("sampleID", "TotalScore")]
colnames(nkScore_tcga)[2] <- "NK_scores"

epiScore_tcga <- epiScore_tcga[, c("sampleID", "TotalScore")]
colnames(epiScore_tcga)[2] <- "Epithelial_scores"

mesScore_tcga <- mesScore_tcga[, c("sampleID", "TotalScore")]
colnames(mesScore_tcga)[2] <- "Mesenchymal_scores"

tgfbScore_tcga <- tgfbScore_tcga[, c("sampleID", "TotalScore")]
colnames(tgfbScore_tcga)[2] <- "TGFbEMT_scores"

Now, we merge all the scores and export them.

multmerge = function(data){
  Reduce(function(x,y) {merge(x, y, by = "sampleID")}, data)
}

allScores <- multmerge(list(nkScore_tcga, 
                            epiScore_tcga, 
                            mesScore_tcga, 
                            tgfbScore_tcga))

# write.csv(allScores, "./output/allScores_MetMelanoma_TCGA_FPKM.csv", 
#           row.names = F)

Look at the merged data

DT::datatable(allScores, filter = "top")
# runApp("./shiny_app")

Signature genes in single samples

We can also look at the samples with highest or lowest scores for a given signature. For example, in the code below, we look at samples with the highest and lowest NK scores. You can change the “nkScore_tcga” with any other score data obtained from simpleScore() function, and accordingly, change the column name, e.g. “NK_scores”.

highScore <- row.names(nkScore_tcga)[nkScore_tcga$NK_scores == max (nkScore_tcga$NK_scores)]
lowScore <- row.names(nkScore_tcga)[nkScore_tcga$NK_scores == min (nkScore_tcga$NK_scores)]

plotRankDensity(rankData = tcgaRank[, highScore, drop = F], 
                upSet = nk, 
                isInteractive = T)
## Warning in plotRankDensity_intl(rankData, upSet = upSet, downSet =
## downSet, : 1 genes missing: KIR2DS4
plotRankDensity(rankData = tcgaRank[, lowScore, drop = F], 
                upSet = nk, 
                isInteractive = T)
## Warning in plotRankDensity_intl(rankData, upSet = upSet, downSet =
## downSet, : 1 genes missing: KIR2DS4

Survival analysis

In this section, we would like to look at the associations between different variables and survival outcome. These variables can be from the below list that stratifies samples for survival analysis:

  • expr : expression of a gene (low, medium, high)
  • score : score of a single signatre (low, medium, high)
  • covariate : age factor by default (low, medium, high)
  • score_expr : stratifies samples based on scores from a signature (high and low) and expression of a gene (high and low)
  • covariate_expr : startifies samples according to covariate (age; high and low) and expression of a gene (high and low)
  • score_covariate : stratifies samples according to scores from a single signature (high and low) and covariate (age; high and low)
  • expr_expr : stratifies samples according to expression of two genes (high gene1/high gene2, high gene1/low gene2, etc)
  • score_score : stratifies samples according to scores of two signatures (high score1/high score2, high score1/low score2, etc)

We can use the function survival_analysis() to plot Kaplan-Meier curves given any of the scenarios mentioned above. The inputs to this functions are:

  1. data : this is the TCGA SKCL data that we downloaded using TCGAbiolink package. This is a specific data object in R that stores expression data as well as several meta-data. Therefore, this function can not take a data frame as input, but instead it takes SummarizedExperiment object.
  2. stratify : one of the above listed options.
  3. scores : with maximum of three columns: one column is called “sample” which has the sample names consistent with sample names in expression data (first argument), and one or two columns of signature scores. An error will be given if the data has more than two score columns. This argument can be NULL if you are not inetersted in the relationship between scores and survival outcome.
  4. gene : names of maximum of two genes; if it is one gene it must be quoted such as “CISH”, and if there are two genes, thy need tp be qouted, separated by comma and put them in paranteses, such as c(“CISH”, “IL15”). This argument can be NULL if you are not inetersted in the relationship between genes and survival outcome.
  5. covariate : this is age by default. This argument can be NULL if you are not inetersted in the relationship between the covariate and survival outcome.

First, we load this function, and prepare different data that we feed into the function:

dat <- rnaseq_fpkm
assay(dat) <- log2(assay(dat) + 1)

colnames(allScores)[1] <- "sample"
nk_tgfb_scores <- allScores[, c("sample", "NK_scores", "TGFbEMT_scores")]

Look at subsets of expression and score data:

datatable(assay(dat)[1:10,1:4], filter = "top")
datatable(nk_tgfb_scores, filter = "top")

Survival curves in Figure 2

First we reproduce the survival curves in figure 2 of the paper. These include the associations between age or age/expresssion of some genes with survival.

checkGenes <- c("IFNG", "KLRD1", "IL15", "B2M")

survival_analysis (
  data = dat,
  stratify = "covariate",
  scores = NULL,
  gene = NULL,
  covariate = "age_at_diagnosis"
  )

for(g in checkGenes){
  print(survival_analysis (
  data = dat,
  stratify = "covariate_expr",
  scores = NULL,
  gene = g,
  covariate = "age_at_diagnosis"
  ))
}

Survival curves in Figure 4

survival_analysis (
  data = dat,
  stratify = "score",
  scores = nk_tgfb_scores,
  gene = NULL,
  covariate = NULL
  )

# survival_analysis (
#   data = dat,
#   stratify = "expr",
#   scores = NULL,
#   gene = "XCL1",
#   covariate = NULL
#   )

checkGenes <- c("XCL1", "CCL5", "GZMB", "FAS", "CD96", "CD3E", "CD8B", "IL15", "IL15RA")

for(g in checkGenes){
  print(survival_analysis (
  data = dat,
  stratify = "expr",
  scores = NULL,
  gene = g,
  covariate = NULL
  ))
}

survival_analysis (
  data = dat,
  stratify = "expr_expr",
  scores = NULL,
  gene = c("IL15", "CISH"),
  covariate = NULL
  )  

survival_analysis (
  data = dat,
  stratify = "score_expr",
  scores = nk_tgfb_scores,
  gene = "CISH",
  covariate = NULL
  )  

## You can use the code below to save any single plot separately; simply run the code below after running the survival analysis for one scenario and make sure that each time you change the name of the exported file.
# ggsave(
#   paste0("survival_analysis_IL15_CISH.png"),
#   device = "png",
#   path = "./figure/",
#   width = 6,
#   height = 4,
#   units = "in"
#   )

Survival curves in Figure 5

For this figure, we have partitioned samples into two groups: young and old, then stratify patients according to NK score and TGFb-EMT score.

medAge <- round(median(colData(dat)$age_at_diagnosis/365, na.rm = T), 1)
dat2 <- dat[, complete.cases(colData(dat)$age_at_diagnosis)]
dat_young <- dat2[, colData(dat2)$age_at_diagnosis/365 < medAge ]
dat_old <- dat2[, colData(dat2)$age_at_diagnosis/365 > medAge ]


survival_analysis (
  data = dat_young,
  stratify = "score_score",
  scores = nk_tgfb_scores,
  gene = NULL,
  covariate = NULL
  )

survival_analysis (
  data = dat_old,
  stratify = "score_score",
  scores = nk_tgfb_scores,
  gene = NULL,
  covariate = NULL
  )

```